Importación de paquetes y carga de .csv
import pandas as pd
import plotly.graph_objects as go
from plotly import offline
# Initialise Plotly's offline notebook mode so figures render inside the notebook.
offline.init_notebook_mode()
# Name of the boolean target column; True rows are labelled 'Cancelado' in the plots below.
Y_NAME = 'IsCanceled'
# Path of the CSV file that is cleaned in place and then loaded into `data`.
CSV_PATH = 'hotusa_cancellations.csv'
# Strip every space character from the .csv file so that the variables
# are parsed correctly. NOTE: the file at CSV_PATH is overwritten in place.
with open(CSV_PATH, 'r') as filein:
    raw_text = filein.read()
with open(CSV_PATH, 'w') as fileout:
    fileout.write(raw_text.replace(' ', ''))
# Load the cleaned file, parsing both date columns as datetimes.
data = pd.read_csv(
    CSV_PATH,
    parse_dates=['ReservationStatusDate', 'ArrivalDate'],
)
data
Exploración básica del DataFrame
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
data.info()
# Descriptive summary statistics of the DataFrame's columns.
data.describe()
Valores perdidos y eliminación de variables
# Fraction of missing values per column.
display(data.isna().sum().div(len(data)))
# Drop 'Company' because 91% of its values are null.
# Drop the first field ('Unnamed: 0'): it has no identification or reference
# in the documentation.
# Drop 'HotelId' because it is a constant.
data.drop(columns=['Company', 'Unnamed: 0', 'HotelId'], inplace=True)
ReservationStatusDate
# Sum of the target flag per ReservationStatusDate (i.e. cancellations per date).
aux = (
    data.sort_values(by='ReservationStatusDate')
    .loc[:, ['ReservationStatusDate', Y_NAME]]
    .groupby('ReservationStatusDate')[Y_NAME]
    .sum()
)
# One bar per date, bar height = number of cancellations.
fig = go.Figure(data=[go.Bar(x=aux.index, y=aux)])
fig.update_layout(
    title='Recuento de cancelaciones por fecha de reserva',
    yaxis_title='Recuento',
    template='plotly_white',
)
fig.show()
ArrivalDate
# Sum of the target flag per ArrivalDate (i.e. cancellations per arrival date).
aux = (
    data.sort_values(by='ArrivalDate')
    .loc[:, ['ArrivalDate', Y_NAME]]
    .groupby('ArrivalDate')[Y_NAME]
    .sum()
)
# One bar per date, bar height = number of cancellations.
fig = go.Figure(data=[go.Bar(x=aux.index, y=aux)])
fig.update_layout(
    title='Recuento de cancelaciones por fecha de llegada',
    yaxis_title='Recuento',
    template='plotly_white',
)
fig.show()
LeadTime
# Box plots of LeadTime, split by the boolean target column.
fig = go.Figure(data=[
    go.Box(x=data.loc[data[Y_NAME], 'LeadTime'], name='Cancelado'),
    go.Box(x=data.loc[~data[Y_NAME], 'LeadTime'], name='No Cancelado'),
])
fig.update_layout(
    xaxis_title='Días de antelación de reserva',
    template='plotly_white',
)
fig.show()
StaysInWeekendNights
# Box plots of StaysInWeekendNights, split by the boolean target column.
fig = go.Figure(data=[
    go.Box(x=data.loc[data[Y_NAME], 'StaysInWeekendNights'], name='Cancelado'),
    go.Box(x=data.loc[~data[Y_NAME], 'StaysInWeekendNights'], name='No Cancelado'),
])
fig.update_layout(
    xaxis_title='Número de noches en fin de semana',
    template='plotly_white',
)
fig.show()
StaysInWeekNights
# Box plots of StaysInWeekNights, split by the boolean target column.
fig = go.Figure(data=[
    go.Box(x=data.loc[data[Y_NAME], 'StaysInWeekNights'], name='Cancelado'),
    go.Box(x=data.loc[~data[Y_NAME], 'StaysInWeekNights'], name='No Cancelado'),
])
fig.update_layout(
    xaxis_title='Número de noches en semana',
    template='plotly_white',
)
fig.show()
Adults
# Contingency table: target flag (rows) vs. Adults (columns).
cross = pd.crosstab(data[Y_NAME], data['Adults'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
# Cancellation rate and row count per category, highest rate first.
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
Children
# Contingency table: target flag (rows) vs. Children (columns).
cross = pd.crosstab(data[Y_NAME], data['Children'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
# Cancellation rate and row count per category, highest rate first.
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
CustomerType
# Contingency table: target flag (rows) vs. CustomerType (columns).
cross = pd.crosstab(data[Y_NAME], data['CustomerType'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
# Cancellation rate and row count per category, highest rate first.
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
ADR
# Box plots of ADR, split by the boolean target column.
fig = go.Figure(data=[
    go.Box(x=data.loc[data[Y_NAME], 'ADR'], name='Cancelado'),
    go.Box(x=data.loc[~data[Y_NAME], 'ADR'], name='No Cancelado'),
])
fig.update_layout(
    xaxis_title='Precio medio por noche',
    template='plotly_white',
)
fig.show()
Meal
# Contingency table: target flag (rows) vs. Meal (columns).
cross = pd.crosstab(data[Y_NAME], data['Meal'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
# Cancellation rate and row count per category, highest rate first.
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
Country
# Sorted by cancellation frequency.
# Contingency table: target flag (rows) vs. Country (columns).
cross = pd.crosstab(data[Y_NAME], data['Country'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
# Sorted by country frequency.
# Contingency table: target flag (rows) vs. Country (columns).
cross = pd.crosstab(data[Y_NAME], data['Country'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['total', 'relative'], ascending=False)
ReservedRoomType
# Contingency table: target flag (rows) vs. ReservedRoomType (columns).
cross = pd.crosstab(data[Y_NAME], data['ReservedRoomType'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
# Cancellation rate and row count per category, highest rate first.
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
IsRepeatedGuest
# Contingency table: target flag (rows) vs. IsRepeatedGuest (columns).
cross = pd.crosstab(data[Y_NAME], data['IsRepeatedGuest'])
n_cancelled = cross.loc[True]
n_total = n_cancelled + cross.loc[False]
# Cancellation rate and row count per category, highest rate first.
pd.DataFrame({
    'relative': n_cancelled / n_total,
    'total': n_total,
}).sort_values(by=['relative', 'total'], ascending=False)
IsCanceled
# Frequency of each value of the target column, shown as a bar chart.
target_counts = data[Y_NAME].value_counts()
fig = go.Figure()
fig.add_trace(go.Bar(x=target_counts.index, y=target_counts))
fig.update_layout(
    title=f'Barras de frecuencia para { Y_NAME }',
    yaxis_title='Recuento',
)
fig.show()